Introduction

Air France: International growth (60% of airline revenues), increasing market share in U.S. market. In 2006, $9.4 billion in the SEM channel.

Media Contacts: Optimizing future campaigns performance. Return on advertising (ROA) for Search Engine Marketing (SEM) campaigns

Goal of this document is to provide insightful charts and tables to give recommendations to AF marketing team that will improve marketing strategy to bring more revenue to the company.

Libraries used for the analysis

library(readxl)
library(ggplot2)
library(plotly) 
library(data.table) 
library(dplyr)
library(tidyr)      # library for gather function
library(gridExtra)  # library for binding the 2 plots in 1 grid
library(grid)       # library for binding the 2 plots in 1 grid

Importing the Air France dataset into R

my_af <- read_excel("/Users/elayenikoylu/Desktop/R/Team Project/Air France Case Spreadsheet Supplement.xls")

Viewing the dataset

glimpse(my_af)
## Rows: 4,510
## Columns: 23
## $ `Publisher ID`             <chr> "K2615", "K2615", "K2003", "K1175", "K1123…
## $ `Publisher Name`           <chr> "Yahoo - US", "Yahoo - US", "MSN - Global"…
## $ `Keyword ID`               <chr> "43000000039657988", "43000000039651113", …
## $ Keyword                    <chr> "fly to florence", "low international airf…
## $ `Match Type`               <chr> "Advanced", "Advanced", "Broad", "Exact", …
## $ Campaign                   <chr> "Western Europe Destinations", "Geo Target…
## $ `Keyword Group`            <chr> "Florence", "Low International DC", "Franc…
## $ Category                   <chr> "uncategorized", "uncategorized", "uncateg…
## $ `Bid Strategy`             <chr> NA, NA, "Position 2-5 Bid Strategy", "Posi…
## $ `Keyword Type`             <chr> "Unassigned", "Unassigned", "Unassigned", …
## $ Status                     <chr> "Live", "Paused", "Deactivated", "Unavaila…
## $ `Search Engine Bid`        <dbl> 6.2500, 6.2500, 0.0000, 7.5000, 0.2500, 0.…
## $ Clicks                     <dbl> 1, 1, 1, 59, 8, 42, 3, 47, 13, 19, 29, 273…
## $ `Click Charges`            <dbl> 2.3125, 0.6250, 0.3875, 2.3125, 2.2000, 5.…
## $ `Avg. Cost per Click`      <dbl> 2.31250000, 0.62500000, 0.38750000, 0.0391…
## $ Impressions                <dbl> 11, 6, 9, 401, 318, 722, 13, 547, 448, 129…
## $ `Engine Click Thru %`      <dbl> 9.0909091, 16.6666667, 11.1111111, 14.7132…
## $ `Avg. Pos.`                <dbl> 1.2727273, 1.0000000, 1.1111111, 2.0000000…
## $ `Trans. Conv. %`           <dbl> 900.000000, 100.000000, 100.000000, 3.3898…
## $ `Total Cost/ Trans.`       <dbl> 0.2569444, 0.6250000, 0.3875000, 1.1562499…
## $ Amount                     <dbl> 8777.95, 1574.20, 390.15, 1665.15, 935.00,…
## $ `Total Cost`               <dbl> 2.3125, 0.6250, 0.3875, 2.3125, 2.2000, 5.…
## $ `Total Volume of Bookings` <dbl> 9, 1, 1, 2, 1, 2, 1, 2, 1, 1, 2, 10, 20, 2…

We take a glimpse of our dataset above to understand the data and variables we are dealing with.

Looking at the first 5 rows of the dataset

head(my_af)
## # A tibble: 6 x 23
##   `Publisher ID` `Publisher Name` `Keyword ID` Keyword `Match Type` Campaign
##   <chr>          <chr>            <chr>        <chr>   <chr>        <chr>   
## 1 K2615          Yahoo - US       43000000039… fly to… Advanced     Western…
## 2 K2615          Yahoo - US       43000000039… low in… Advanced     Geo Tar…
## 3 K2003          MSN - Global     43000000019… air di… Broad        Air Fra…
## 4 K1175          Google - Global  43000000005… [airfr… Exact        Air Fra…
## 5 K1123          Overture - Glob… 43000000005… air fr… Standard     Unassig…
## 6 K1123          Overture - Glob… 43000000005… airfra… Standard     Unassig…
## # … with 17 more variables: `Keyword Group` <chr>, Category <chr>, `Bid
## #   Strategy` <chr>, `Keyword Type` <chr>, Status <chr>, `Search Engine
## #   Bid` <dbl>, Clicks <dbl>, `Click Charges` <dbl>, `Avg. Cost per
## #   Click` <dbl>, Impressions <dbl>, `Engine Click Thru %` <dbl>, `Avg.
## #   Pos.` <dbl>, `Trans. Conv. %` <dbl>, `Total Cost/ Trans.` <dbl>,
## #   Amount <dbl>, `Total Cost` <dbl>, `Total Volume of Bookings` <dbl>

Checking the head of our data also helps us understand it.

Finding the class of the dataset

class(my_af)
## [1] "tbl_df"     "tbl"        "data.frame"

Above we are checking the class of our dataset. As we can see the output gives us a kind of mixed ideas about the class. So, later it’s better if set it as a data frame.

Checking all the variable names

colnames(my_af)
##  [1] "Publisher ID"             "Publisher Name"          
##  [3] "Keyword ID"               "Keyword"                 
##  [5] "Match Type"               "Campaign"                
##  [7] "Keyword Group"            "Category"                
##  [9] "Bid Strategy"             "Keyword Type"            
## [11] "Status"                   "Search Engine Bid"       
## [13] "Clicks"                   "Click Charges"           
## [15] "Avg. Cost per Click"      "Impressions"             
## [17] "Engine Click Thru %"      "Avg. Pos."               
## [19] "Trans. Conv. %"           "Total Cost/ Trans."      
## [21] "Amount"                   "Total Cost"              
## [23] "Total Volume of Bookings"

Checking the column names to see if there are any inconsistencies. Referring this, we will make some changes in terms of consistency.

Exploring the summary statistics of each variable

summary(my_af)
##  Publisher ID       Publisher Name      Keyword ID          Keyword         
##  Length:4510        Length:4510        Length:4510        Length:4510       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##   Match Type          Campaign         Keyword Group        Category        
##  Length:4510        Length:4510        Length:4510        Length:4510       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Bid Strategy       Keyword Type          Status          Search Engine Bid
##  Length:4510        Length:4510        Length:4510        Min.   : 0.000   
##  Class :character   Class :character   Class :character   1st Qu.: 3.384   
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.250   
##                                                           Mean   : 5.435   
##                                                           3rd Qu.: 6.250   
##                                                           Max.   :27.500   
##      Clicks        Click Charges      Avg. Cost per Click  Impressions     
##  Min.   :    0.0   Min.   :    0.00   Min.   : 0.000      Min.   :      0  
##  1st Qu.:    1.0   1st Qu.:    2.31   1st Qu.: 0.825      1st Qu.:     28  
##  Median :    4.0   Median :    6.76   Median : 1.650      Median :    176  
##  Mean   :  113.7   Mean   :  167.48   Mean   : 1.890      Mean   :   9284  
##  3rd Qu.:   19.0   3rd Qu.:   28.49   3rd Qu.: 2.663      3rd Qu.:    844  
##  Max.   :34012.0   Max.   :46188.44   Max.   :10.000      Max.   :8342415  
##  Engine Click Thru %   Avg. Pos.      Trans. Conv. %     Total Cost/ Trans.
##  Min.   :  0.000     Min.   : 0.000   Min.   :  0.0000   Min.   :   0.00   
##  1st Qu.:  1.532     1st Qu.: 1.143   1st Qu.:  0.0000   1st Qu.:   0.00   
##  Median :  4.106     Median : 1.594   Median :  0.0000   Median :   0.00   
##  Mean   : 11.141     Mean   : 1.930   Mean   :  0.5693   Mean   :  27.61   
##  3rd Qu.: 10.917     3rd Qu.: 2.308   3rd Qu.:  0.0000   3rd Qu.:   0.00   
##  Max.   :200.000     Max.   :15.000   Max.   :900.0000   Max.   :9597.17   
##      Amount         Total Cost       Total Volume of Bookings
##  Min.   :     0   Min.   :    0.00   Min.   :  0.0000        
##  1st Qu.:     0   1st Qu.:    2.31   1st Qu.:  0.0000        
##  Median :     0   Median :    6.76   Median :  0.0000        
##  Mean   :  1034   Mean   :  167.48   Mean   :  0.8734        
##  3rd Qu.:     0   3rd Qu.:   28.49   3rd Qu.:  0.0000        
##  Max.   :567463   Max.   :46188.44   Max.   :439.0000

Above displaying summary statistics here to better understand the numerical variables. It’ll help us when digging deeper.

Data Massaging

In this section, we will be manipulating(massaging) the data to make it more convenient for our analysis.

Col names transformation for consistency

Setting " " as "_" in col names for the future analysis

names(my_af) <- gsub(" ", "_", names(my_af))
colnames(my_af)
##  [1] "Publisher_ID"             "Publisher_Name"          
##  [3] "Keyword_ID"               "Keyword"                 
##  [5] "Match_Type"               "Campaign"                
##  [7] "Keyword_Group"            "Category"                
##  [9] "Bid_Strategy"             "Keyword_Type"            
## [11] "Status"                   "Search_Engine_Bid"       
## [13] "Clicks"                   "Click_Charges"           
## [15] "Avg._Cost_per_Click"      "Impressions"             
## [17] "Engine_Click_Thru_%"      "Avg._Pos."               
## [19] "Trans._Conv._%"           "Total_Cost/_Trans."      
## [21] "Amount"                   "Total_Cost"              
## [23] "Total_Volume_of_Bookings"

Making all col names consistent / Removing “%” sign

colnames(my_af)[which(names(my_af) == "Total_Cost/_Trans.")] <- "Total_Cost_Each_Trans"
colnames(my_af)[which(names(my_af) == "Engine_Click_Thru_%")] <- "Engine_Click_Through_Prct"
colnames(my_af)[which(names(my_af) == "Trans._Conv._%")] <- "Trans_Conv_Prct"
colnames(my_af)[which(names(my_af) == "Avg._Pos.")] <- "Avg_Pos"
colnames(my_af)[which(names(my_af) == "Avg._Cost_per_Click")] <- "Avg_Cost_Each_Click"
colnames(my_af)
##  [1] "Publisher_ID"              "Publisher_Name"           
##  [3] "Keyword_ID"                "Keyword"                  
##  [5] "Match_Type"                "Campaign"                 
##  [7] "Keyword_Group"             "Category"                 
##  [9] "Bid_Strategy"              "Keyword_Type"             
## [11] "Status"                    "Search_Engine_Bid"        
## [13] "Clicks"                    "Click_Charges"            
## [15] "Avg_Cost_Each_Click"       "Impressions"              
## [17] "Engine_Click_Through_Prct" "Avg_Pos"                  
## [19] "Trans_Conv_Prct"           "Total_Cost_Each_Trans"    
## [21] "Amount"                    "Total_Cost"               
## [23] "Total_Volume_of_Bookings"

Class-Type check

Setting my_af as a data frame

my_af <- as.data.frame(my_af)

Setting characters as factors

my_af[sapply(my_af, is.character)] <- lapply(my_af[sapply(my_af, is.character)], 
                                             as.factor)

Missing Value Detection / Replacement

Detecting the number of missing values

sum(is.na(my_af))
## [1] 1224

Removing missing values

my_af_clean <- na.omit(my_af) 

Creating additional data frames and new variables for the future use

Creating an additional data frame without $0s of Booking Value

my_nonzero_clean <- my_af_clean[my_af_clean$Total_Volume_of_Bookings > 0, ]
my_nonzero <- my_af[my_af$Total_Volume_of_Bookings > 0, ]

Creating a new variable called ROA for my original dataset

my_af$Profit <- my_af$Amount - my_af$Total_Cost
my_af$ROA <- my_af$Profit / my_af$Total_Cost

Creating the same variables for my original clean dataset

my_af_clean$Profit <- my_af_clean$Amount - my_af_clean$Total_Cost
my_af_clean$ROA <- my_af_clean$Profit / my_af_clean$Total_Cost

Creating the same variables for my nonzero dataset

my_nonzero$Profit <- my_nonzero$Amount - my_nonzero$Total_Cost
my_nonzero$ROA <- my_nonzero$Profit / my_nonzero$Total_Cost

Creating the same variables for my nonzero and clean dataset

my_nonzero_clean$Profit <- my_nonzero_clean$Amount - my_nonzero_clean$Total_Cost
my_nonzero_clean$ROA <- my_nonzero_clean$Profit / my_nonzero_clean$Total_Cost

Categorizing Keyword_Group and create new variable called “new_cat”

rowsinmy <- nrow(my_nonzero_clean) # getting row indexes to use in my loop
my_nonzero_clean$new_cat <- NA # Creating an empty variable

for (i in 1:rowsinmy){    # Looping the catch keywords to categorize them in 4
  if(my_nonzero_clean$Keyword_Group[i] %like% "Air France" ) {
    my_nonzero_clean$new_cat[i] <- "Branded Only"
  } else if (my_nonzero_clean$Keyword_Group[i] %like% "Branded") {
    my_nonzero_clean$new_cat[i] <- "Geo Branded"
  } else if (my_nonzero_clean$Keyword_Group[i] %like% "Sale") {
    my_nonzero_clean$new_cat[i] <- "Branded Only"
  } else if (my_nonzero_clean$Keyword_Group[i] %like% "Discount"){
    my_nonzero$new_cat[i] <- "Branded Only"  
  } else if (my_nonzero_clean$Keyword_Group[i] %like% "Unassigned"){
    my_nonzero_clean$new_cat[i] <- "Others"
  }else {
    my_nonzero_clean$new_cat[i] <- "Geo Only"
  }
} # Closing the i-loop

table(my_nonzero_clean$new_cat) # Checking my new variable
## 
## Branded Only  Geo Branded     Geo Only       Others 
##           70            3          103           79

Before start plotting our findings, we would like to set a theme for consistency

theme_my_af <- function(){ 
  theme_minimal() +
    theme(
      text = element_text(color = "gray25"),
      plot.subtitle = element_text(size = 12),
      plot.caption = element_text(color = "gray30"),
      plot.background = element_rect(fill = "gray95"),
      plot.margin = unit(c(5, 10, 5, 10), units = "mm")
    )
}

Tables and Plots

Data frames I will use to create tables and plots

my_af # Original dataset
my_af_clean # Original dataset without NAs
my_nonzero # Nonzero dataset with NAs
my_nonzero_clean # Nonzero dataset without NAs

Plot for Publisher Name vs ROA

ggplot(my_table4, aes(x = reorder(Publisher_Name, -avgROA), y = avgROA), fill = Publisher_Name) + 
  geom_col(fill = "lightblue") + 
  labs(x = "Publisher_Name", y = "ROA (Average)", title = "Publishers by Average ROA (Return on Advertisement)") + 
  theme_my_af() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Table for Publisher Name vs average Click Thru and average Conversion Rate

my_table5 <- my_af %>%
  group_by(Publisher_Name) %>%
  summarize(avgClickThrough = mean(Engine_Click_Through_Prct), avgConv = mean(Trans_Conv_Prct)) %>%
  arrange(desc(avgClickThrough)) %>%
  head(10)

my_table5
## # A tibble: 7 x 3
##   Publisher_Name    avgClickThrough avgConv
##   <fct>                       <dbl>   <dbl>
## 1 Yahoo - US                  16.1   1.83  
## 2 Google - US                 15.1   0.415 
## 3 Google - Global              8.99  0.433 
## 4 MSN - US                     8.26  0.731 
## 5 MSN - Global                 7.34  1.13  
## 6 Overture - Global            3.55  0.236 
## 7 Overture - US                2.67  0.0949

Plot for Publisher Name vs Amount

my_plot2 <- ggplot(data = my_af, aes(x = Publisher_Name,
                                     y= Amount)) +
  geom_point(aes(alpha = 1), shape = 5) + labs(title = "Keywords - Amount vs Publisher Name",
                                               x = "Publisher Name", y = "Amount") + theme_my_af()

ggplotly(my_plot2) #After plotting, zoom in the outlier in Yahoo US to point out that it does better than Google US

Table for Match_Type

my_table2 <- my_af %>%
  group_by(Match_Type) %>%
  summarize(avgClick = round(mean(Clicks)), avgCost = mean(Avg_Cost_Each_Click), avgConv = mean(Trans_Conv_Prct),
            avgBook = mean(Total_Volume_of_Bookings), avgcostpertrans =mean(Total_Cost_Each_Trans), profit = mean(Profit), roa = mean(ROA)) %>%
  arrange(desc(avgBook)) %>%
  head(10)

my_table2
## # A tibble: 5 x 8
##   Match_Type avgClick avgCost avgConv avgBook avgcostpertrans  profit    roa
##   <fct>         <dbl>   <dbl>   <dbl>   <dbl>           <dbl>   <dbl>  <dbl>
## 1 Exact          2566   1.66   0.364  38.6              5.70  44743.   36.5 
## 2 Advanced        120   1.50   1.23    1.02            26.7    1176.    7.80
## 3 Broad            89   2.40   0.464   0.680           31.4     583.    1.77
## 4 Standard        125   0.870  0.189   0.382           19.5     304.    2.61
## 5 N/A              25   1.16   0.0611  0.0625           0.889    80.9 Inf
# Just a modify version of the table above for presenting
my_table3 <- my_af %>%
  group_by(Match_Type) %>%
  summarize(avgClick = round(mean(Clicks)), avgConv = mean(Trans_Conv_Prct),
            avgBook = mean(Total_Volume_of_Bookings), avgcostpertrans =mean(Total_Cost_Each_Trans), profit = mean(Profit), roa = mean(ROA)) %>%
  arrange(desc(avgBook)) %>%
  head(10)

my_table3
## # A tibble: 5 x 7
##   Match_Type avgClick avgConv avgBook avgcostpertrans  profit    roa
##   <fct>         <dbl>   <dbl>   <dbl>           <dbl>   <dbl>  <dbl>
## 1 Exact          2566  0.364  38.6              5.70  44743.   36.5 
## 2 Advanced        120  1.23    1.02            26.7    1176.    7.80
## 3 Broad            89  0.464   0.680           31.4     583.    1.77
## 4 Standard        125  0.189   0.382           19.5     304.    2.61
## 5 N/A              25  0.0611  0.0625           0.889    80.9 Inf

Table for Match Type with averages

my_table1 <- my_af_clean %>%
  group_by(Match_Type) %>%
  summarize(avgClick = round(mean(Clicks)), avgCost = mean(Avg_Cost_Each_Click), avgConv = mean(Trans_Conv_Prct),
            avgBook = mean(Total_Volume_of_Bookings)) %>%
  arrange(desc(avgBook)) %>%
  head(10)

my_table1
## # A tibble: 4 x 5
##   Match_Type avgClick avgCost avgConv avgBook
##   <fct>         <dbl>   <dbl>   <dbl>   <dbl>
## 1 Exact           566   1.76    0.249   9    
## 2 Advanced        174   0.737   0.181   0.866
## 3 Broad            63   2.39    0.431   0.363
## 4 Standard        148   0.672   0.157   0.336

Plot for Match Type by Average Clicks vs Average Conversion Rate

plot_ly(my_table1, x = ~avgConv, y = ~avgClick,
        textposition = "auto",
        type = 'scatter', 
        mode = 'markers', 
        size = ~avgClick/avgConv, 
        color = ~Match_Type, 
        colors = 'Paired',
        marker = list(opacity = 0.8, sizemode = 'diameter')) %>%
  layout(title = 'Match Type by Average Clicks vs Conversion',
         xaxis = list(title = "Average Conversion Rate", showgrid = TRUE),
         yaxis = list(title = "Average Clicks", showgrid = TRUE),
         showlegend = TRUE) 

Table for Campaign vs ROA

my_table_camp <- my_af %>%
  group_by(Campaign) %>%
  summarize(avgROA = mean(ROA)) %>%
  arrange(desc(avgROA)) %>%
  tail(23)

my_table_camp
## # A tibble: 23 x 2
##    Campaign                               avgROA
##    <fct>                                   <dbl>
##  1 Air France Branded                      30.5 
##  2 Geo Targeted DC                         17.7 
##  3 Western Europe Destinations              9.02
##  4 Air France Brand & French Destinations   5.41
##  5 Air France Global Campaign               4.86
##  6 Unassigned                               2.71
##  7 Geo Targeted Miami                       2.27
##  8 General Terms                            2.26
##  9 Geo Targeted San Francisco               2.00
## 10 Geo Targeted New York                    1.53
## # … with 13 more rows

Plot for Campaign vs ROA

ggplot(my_table_camp, aes(x = reorder(Campaign, -avgROA), y = avgROA)) + 
  geom_col(fill = "pink") + 
  labs(x = "Campaign", y = "ROA (Average)", title = "ROA by Campaign") + 
  theme_my_af() + 
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Keyword Group Plot

my_plot1 <- ggplot(data = my_nonzero_clean, aes(x = my_nonzero_clean$Engine_Click_Through_Prct ,
                                          y= my_nonzero_clean$Trans_Conv_Prct , color = new_cat)) +
  geom_point(aes(alpha = 1), shape = 5) + labs(title = "Keywords - Click Through vs Conversion",
                                               x = "Conversion Rate", y = "Click Through Rate") + theme_my_af()

ggplotly(my_plot1)

Table for Status = “Live” & "Paused

# Live
Live_status <- my_af %>%
  group_by(Publisher_Name) %>%
  filter(Status == "Live") %>%
  summarize(avgROA = mean(ROA), avgClickThru = mean(Engine_Click_Through_Prct),avgConv = mean(Trans_Conv_Prct)) %>%
  arrange(desc(avgROA)) %>%
  head(10)

# Paused
Paused_status <- my_af %>%
  group_by(Publisher_Name) %>%
  filter(Status == "Paused") %>%
  summarize(avgROA = mean(ROA), avgClickThru = mean(Engine_Click_Through_Prct),avgConv = mean(Trans_Conv_Prct)) %>%
  arrange(desc(avgROA)) %>%
  head(10)

Defininf two plots into names for the combined plot about Status

## Live_Status: Filters with Publisher name
clustered_live <- gather(Live_status, type, value, -Publisher_Name)

LivePlot <- ggplot(clustered_live, aes(type, value)) + 
  geom_bar(aes(fill = Publisher_Name), stat = "identity", position = "dodge")+
  theme_my_af()+
  labs(x = "Parameters",title = "Status: LIVE")

## Paused_Status: Filters with Publisher name
clustered_paused <- gather(Paused_status, type , value, -Publisher_Name)

PausedPlot <- ggplot(clustered_paused, aes(type, value)) + 
  geom_bar(aes(fill = Publisher_Name), stat = "identity", position = "dodge")+
  theme_my_af()+
  labs(x = "Parameters",title = "Status: PAUSED")

Plotting 2 graphs in the same grid

grid.arrange(LivePlot, PausedPlot, nrow = 1)